# knitr chunk options for this report: show code, cache chunk results
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
Transform CSV into binary Rdata files
# Convert each daily-log CSV file into a binary Rdata file (in parallel),
# and collect the ids of users with at least one positive pregnancy test.
input_folder = paste0(IO$input_data, "Days/")
output_folder = paste0(IO$tmp_data, "Days_Rdata_from_csv/")
if(!dir.exists(output_folder)){dir.create(output_folder)}
files = list.files(input_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
users = foreach(file = files, .combine = c) %dopar%{
  days = read.csv(paste0(input_folder, file), stringsAsFactors = FALSE)
  # harmonize column names: "account" becomes "user_id"
  colnames(days)[colnames(days) == "account"] = "user_id"
  # identify users with a positive pregnancy test in this file
  users_with_pos_preg_tests = unique(days$user_id[which(days$preg_test == 1)])
  # formatting
  days$date = as.Date(days$date)
  # transform first_day into a logical (original kept in first_day_o)
  days$first_day_o = days$first_day
  days$first_day = (days$first_day == "True")
  # BUG FIX: anchor the pattern so only the trailing ".csv" extension is
  # replaced; the previous gsub("csv", ...) would also mangle the string
  # "csv" occurring anywhere inside a file name.
  new_file_name = gsub("\\.csv$", ".Rdata", file)
  save(days, file = paste0(output_folder, new_file_name))
  return(users_with_pos_preg_tests)
}
# BUG FIX: stopImplicitCluster() does not stop a cluster created with
# makeCluster(); stopCluster(cl) is required to release the workers.
stopCluster(cl)
save(users, file = paste0(IO$tmp_data, "full_list_users_with_pos_preg_tests.Rdata"))
Create a user table from the list of users that ever logged a positive pregnancy test
# Build the users table: one row per user who ever logged a positive
# pregnancy test, sorted by user id, then assigned to batches of
# (at most) par$max_batch_size users.
users = data.frame(user_id = as.character(unique(users)),
                   pos_preg_test = TRUE,
                   stringsAsFactors = FALSE)
users = users[order(users$user_id), ]
per_batch = min(par$max_batch_size, ceiling(nrow(users) / par$min_n_batches))
nb_batches = ceiling(nrow(users) / per_batch)
users$batch = rep(seq_len(nb_batches), each = per_batch)[seq_len(nrow(users))]
save(users, file = paste0(IO$output_data, "users.Rdata"))
ok = file.copy(from = paste0(IO$output_data, "users.Rdata"),
               to = paste0(IO$tmp_data, "users_with_pos_preg_tests.Rdata"),
               overwrite = TRUE)
Filter the days table and re-organize users into batches
# Filter each days table down to the retained users and split it into one
# file per (user batch, input file) pair.
input_folder = paste0(IO$tmp_data, "Days_Rdata_from_csv/")
tmp_folder = paste0(IO$tmp_data, "Days_filtered_split_batches/")
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}
files = list.files(input_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
ok = foreach(file = files) %dopar%{
  # load() creates `days` in the worker's environment
  load(paste0(input_folder, file), verbose = TRUE)
  full_days = days
  # keep only the days of users with a positive pregnancy test
  full_days = full_days[full_days$user_id %in% users$user_id, ]
  # remember which original input file each row came from
  full_days$input_file_id = file
  # write one file per batch present in this input file
  for(b in unique(users$batch[users$user_id %in% full_days$user_id])){
    days = full_days[full_days$user_id %in% users$user_id[users$batch == b], ]
    days$batch = b
    save(days, file = paste0(tmp_folder, "batch_", b, "_", file))
  }
}
# BUG FIX: stopCluster(cl) instead of stopImplicitCluster(): the latter
# does not stop clusters created with makeCluster(), leaking the workers.
stopCluster(cl)
# Re-assemble the per-file batch pieces into one Rdata file per batch, and
# record, for each user, the original input file(s) their data came from.
input_folder = paste0(IO$tmp_data, "Days_filtered_split_batches/")
output_folder = paste0(IO$output_data, "Days/")
tmp_folder = paste0(IO$tmp_data, "Days_filtered/")
# start from a clean output folder
if(dir.exists(input_folder)){unlink(output_folder, recursive = TRUE); dir.create(output_folder)}
# ROBUSTNESS: ensure the output folder exists even if the branch above was
# skipped — the save() calls below would fail otherwise.
if(!dir.exists(output_folder)){dir.create(output_folder)}
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}
files = list.files(input_folder)
# BUG FIX: the cluster is now created once before the loop and stopped once
# after it. Previously a new cluster was created at every iteration of the
# %do% loop and never stopped (stopImplicitCluster() does not stop clusters
# created with makeCluster()), leaking one set of workers per batch.
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
input_files = foreach(b = unique(users$batch), .combine = rbind) %do%{
  batch_files = files[grep(paste0("batch_", b, "_day"), files)]
  # row-bind all the pieces of this batch (in parallel over files)
  days = foreach(file = batch_files, .combine = rbind) %dopar%{
    load(paste0(input_folder, file), verbose = TRUE)
    return(days)
  }
  save(days, file = paste0(output_folder, "days_", b, ".Rdata"))
  file.copy(from = paste0(output_folder, "days_", b, ".Rdata"),
            to = paste0(tmp_folder, "days_", b, ".Rdata"), overwrite = TRUE)
  # per-user list of original input files, concatenated with "|"
  input_files = aggregate(input_file_id ~ user_id, days,
                          function(x){paste0(unique(sort(x)), collapse = "|")})
  return(input_files)
}
stopCluster(cl)
save(input_files, file = paste0(IO$tmp_data, "input_files.Rdata"))
# a user can appear in several batch files: aggregate once more across batches
input_files = aggregate(input_file_id ~ user_id, input_files,
                        function(x){paste0(unique(sort(x)), collapse = "|")})
users$input_files = input_files$input_file_id[match(users$user_id, input_files$user_id)]
save(users, file = paste0(IO$output_data, "users.Rdata"))
file.copy(from = paste0(IO$output_data, "users.Rdata"),
          to = paste0(IO$tmp_data, "users_with_original_file_id.Rdata"), overwrite = TRUE)
## [1] TRUE
Note: we cannot use the cycles table that Kindara provided because the account ids are not linked between the 3 tables.
# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
# Attempt to read the cycles tables provided by the app.
# NOTE(review): per the note above, the provided cycles table cannot be
# linked to the other tables, so this chunk appears to be unused
# exploratory code. If it were ever run, two issues stand out:
# - `file = cycles_files[3]` below overrides the foreach loop variable, so
#   every iteration would read the same (third) file — looks like leftover
#   debugging code.
# - foreach() has no .combine here, so `cycles` would be a list of data
#   frames, and it is unclear which %dopar% backend is registered at this
#   point — verify before reusing.
cycles_input_folder = paste0(IO$input_data,"Cycles/")
cycles_files = list.files(cycles_input_folder)
cycles = foreach(file = cycles_files) %dopar%
{
file = cycles_files[3]
cycles = read.csv(paste0(cycles_input_folder,file), stringsAsFactors = FALSE)
dim(cycles)
colnames(cycles)
cycles$user_id = cycles$account
# keep only the cycles of users with a positive pregnancy test
j = which(cycles$user_id %in% users$user_id)
length(j)
return(cycles[j,])
}
dim(cycles)
We thus need to create the cycles from the days table by looking at which days have the flag first_day.
We will take this opportunity of going through the days table to
clean some columns (transform first_day into a logical)
remove duplicated rows in the days table
# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
# Build the cycles table from the days tables: a new cycle starts on every
# day flagged as `first_day`. Duplicated day rows are removed on the way.
# NOTE(review): it is unclear which %dopar% backend is registered at this
# point (the previous clusters were created and stopped above) — verify;
# foreach falls back to sequential execution with a warning if none is.
days_input_folder = paste0(IO$output_data,"Days/")
days_files = list.files(days_input_folder)
cycles = foreach(file = days_files, .combine = rbind) %dopar%
{
load(paste0(days_input_folder,file), verbose = TRUE)
colnames(days)
dim(days)
# checking for duplicated rows (and removing them)
d = duplicated(days)
j = which(d)
if(length(j)>0){
days = days[-j,]
}
# creating the cycles table: one row per day flagged as a cycle start
cycles = days[days$first_day, c("user_id","date")]
colnames(cycles)[which(colnames(cycles) == "date")] = "start_date"
cycles = cycles[order(cycles$user_id, cycles$start_date),]
# keep only the cycles of retained users
j = which(cycles$user_id %in% users$user_id)
length(j)
cycles = cycles[j,]
return(cycles)
}
dim(cycles)
## [1] 83477 2
save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_first_version.Rdata"), overwrite = TRUE)
## [1] TRUE
We create unique cycle ID in the cycle table
# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
# Number cycles within each user and derive each cycle's end date / length.
cycles = cycles[order(cycles$user_id, cycles$start_date),]
# cycle_nb = running count of cycles within each user (1, 2, 3, ...)
cycles$cycle_nb = ave(rep(1, nrow(cycles)), cycles$user_id, FUN = cumsum)
cycles$cycle_id = paste0(cycles$user_id, "_" ,cycles$cycle_nb)
# end_date: for each cycle, match() finds the row whose (user_id, cycle_nb - 1)
# equals this cycle's id — i.e. the user's *next* cycle — and takes the day
# before that cycle's start date; a user's last cycle therefore gets NA.
cycles$end_date = cycles$start_date[match(cycles$cycle_id, paste0(cycles$user_id,"_",cycles$cycle_nb-1))] - 1
cycles$cycle_length = as.numeric(cycles$end_date - cycles$start_date + 1)
save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_with_nb_and_id.Rdata"), overwrite = TRUE)
## [1] TRUE
And associate each row of the days to a cycle
# Associate each row of the days tables with its cycle (cycle_id, cycle_nb,
# cycle day index) by expanding every cycle into one row per day and
# matching on (user_id, date).
days_folder = paste0(IO$output_data,"Days/")
days_tmp_folder = paste0(IO$tmp_data,"Days_with_cycle_id/")
if(!dir.exists(days_tmp_folder)){dir.create(days_tmp_folder)}
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
days_files = list.files(days_folder)
ok = foreach(file = days_files) %dopar%
{
  load(paste0(days_folder,file), verbose = TRUE)
  # cycles of the users present in this file, with a known length
  j = which((cycles$user_id %in% unique(days$user_id)) & (!is.na(cycles$cycle_length)))
  cycles_sub = cycles[j,]
  # expand each cycle into one row per day of that cycle
  cycles_sub_exp = as.data.frame(lapply(cycles_sub, rep, cycles_sub$cycle_length))
  # cycleday = 1, 2, ... within each expanded cycle
  cycles_sub_exp$cycleday = ave(rep(1,nrow(cycles_sub_exp)), cycles_sub_exp$cycle_id, FUN =cumsum)
  cycles_sub_exp$date = cycles_sub_exp$start_date + (cycles_sub_exp$cycleday - 1)
  cycles_sub_exp$day_id = paste0(cycles_sub_exp$user_id, "_", cycles_sub_exp$date)
  # match observed days to the expanded cycle-days on user_id + date
  days$day_id = paste0(days$user_id, "_", days$date)
  m = match(days$day_id, cycles_sub_exp$day_id)
  days$cycle_nb = cycles_sub_exp$cycle_nb[m]
  days$cycle_id = cycles_sub_exp$cycle_id[m]
  days$cycle_length = cycles_sub_exp$cycle_length[m]
  days$cycleday = cycles_sub_exp$cycleday[m]
  # day index counted from the end of the cycle: last day = -1, previous = -2, ...
  days$cycleday_from_end = days$cycleday - days$cycle_length - 1
  save(days, file = paste0(days_folder, file))
  file.copy(from = paste0(days_folder, file), to = paste0(days_tmp_folder, file), overwrite = TRUE)
}
# BUG FIX: stopCluster(cl) instead of stopImplicitCluster(): the latter
# does not stop clusters created with makeCluster(), leaking the workers.
stopCluster(cl)
Now we can aggregate the days table to report useful information on the cycles table
Aggregate the days tables to enrich the cycles table
# Aggregate the days tables to compute per-cycle summary statistics
# (number of observed days, pregnancy tests, sex logs, temperatures, ...).
input_days_folder = paste0(IO$tmp_data,"Days_with_cycle_id/")
output_days_folder = paste0(IO$output_data,"Days/")
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
days_files = list.files(input_days_folder)
cycles_agg = foreach(file = days_files, .combine = rbind, .packages = c('plyr','dplyr')) %dopar%
{
load(paste0(input_days_folder,file), verbose = TRUE)
# take this opportunity to change the way preg tests are encoded:
# 2 (negative test) becomes -1, so positives are 1 and negatives are -1
days$preg_test_o = days$preg_test
days$preg_test[days$preg_test == 2] = -1
save(days, file = paste0(output_days_folder,file))
# per-cycle aggregates
# NOTE(review): .parallel = TRUE runs inside a %dopar% worker (nested
# parallelism); workers have no registered backend of their own, so plyr
# most likely warns and runs sequentially — confirm and consider FALSE.
# NOTE(review): lu() is a project helper — presumably length(unique(.));
# confirm against the project's utility functions.
cycles_agg = ddply(days,
.(cycle_id),
.parallel = TRUE, # FALSE, # TRUE
.fun = summarize,
cycle_length = min(cycle_length),
n_days_obs = lu(date),
last_obs_day = max(cycleday),
n_pos_preg_test = sum(preg_test == 1),
n_neg_preg_test = sum(preg_test == -1),
day_first_pos_preg_test = min( cycleday_from_end * (preg_test == 1), na.rm = TRUE),
day_last_pos_preg_test = max(cycleday * (preg_test == 1), na.rm = TRUE),
day_last_preg_test = max(cycleday * (preg_test %in% c(1,-1)), na.rm = TRUE),
n_tot_sex = sum(sex > 0, na.rm = TRUE),
n_prot_sex = sum(sex == 1, na.rm = TRUE),
n_unprot_sex = sum(sex == 2, na.rm = TRUE),
n_withdrawal = sum(sex == 3, na.rm = TRUE),
n_insemination = sum(sex == 4, na.rm = TRUE),
n_BBT = sum(!is.na(temperature), na.rm = TRUE))
# day_first_pos_preg_test was computed from the end of the cycle (negative
# values); convert it back to a day counted from the cycle start
j = which(cycles_agg$day_first_pos_preg_test < 0)
cycles_agg$day_first_pos_preg_test[j] = cycles_agg$cycle_length[j] + cycles_agg$day_first_pos_preg_test[j] + 1
# normalize missing / infinite aggregates (cycles without any test) to 0
cycles_agg$n_pos_preg_test[is.na(cycles_agg$n_pos_preg_test)] = 0
cycles_agg$n_neg_preg_test[is.na(cycles_agg$n_neg_preg_test)] = 0
cycles_agg$day_first_pos_preg_test[is.infinite(cycles_agg$day_first_pos_preg_test)] = 0
cycles_agg$day_last_pos_preg_test[is.infinite(cycles_agg$day_last_pos_preg_test)] = 0
# n_days_obs_after_first_pos_preg_test
days$day_first_pos_preg_test = cycles_agg$day_first_pos_preg_test[match(days$cycle_id, cycles_agg$cycle_id)]
days$after_first_pos_preg_test = (days$day_first_pos_preg_test > 0) & (days$cycleday > days$day_first_pos_preg_test)
cycles_agg2 = aggregate(after_first_pos_preg_test ~ cycle_id, days, sum, na.rm = TRUE )
cycles_agg$n_days_obs_after_first_pos_preg_test = cycles_agg2$after_first_pos_preg_test[match(cycles_agg$cycle_id, cycles_agg2$cycle_id)]
# last_preg_test: result (1 or -1) of the last test taken in the cycle
days$day_last_preg_test = cycles_agg$day_last_preg_test[match(days$cycle_id, cycles_agg$cycle_id)]
cycles_agg2 = days[which(days$cycleday == days$day_last_preg_test),]
cycles_agg$last_preg_test = cycles_agg2$preg_test[match(cycles_agg$cycle_id, cycles_agg2$cycle_id)]
cycles_agg$last_preg_test[is.na(cycles_agg$last_preg_test)]= 0
# preg_test_class
#cycles_agg$preg_test_class = ifelse(cycles_agg$n_pos_preg_test>0,ifelse(cycles_agg$last_preg_test == 1, "pregnant","pregnancy loss"), ifelse(cycles_agg$n_neg_preg_test>0,"not pregnant", "not tested"))
cycles_agg$preg_test_class = ifelse(cycles_agg$n_pos_preg_test>0,"pregnant", ifelse(cycles_agg$n_neg_preg_test>0,"not pregnant", "not tested"))
return(cycles_agg)
}
# BUG FIX: stopCluster(cl) instead of stopImplicitCluster(): the latter
# does not stop clusters created with makeCluster(), leaking the workers.
stopCluster(cl)
save(cycles_agg, file = paste0(IO$tmp_data, "cycles_agg.Rdata"))
# Copy the newly aggregated columns onto the cycles table (matched on cycle_id).
# IMPROVED: setdiff() instead of x[-which(x %in% ...)] (which collapses to
# an empty vector when which() matches nothing), and [[ ]] assignment
# instead of eval(parse(text = ...)).
column_names = setdiff(colnames(cycles_agg), colnames(cycles))
m = match(cycles$cycle_id, cycles_agg$cycle_id)
for(column in column_names){
  cycles[[column]] = cycles_agg[[column]][m]
}
save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_with_agg.Rdata"), overwrite = TRUE)
## [1] TRUE
#load(paste0(IO$tmp_data,"users_with_original_file_id.Rdata"),verbose = TRUE)
# Per-user aggregates from the cycles table.
# suppressWarnings: min()/max() over empty selections warn and return
# +/-Inf; those Inf values are cleaned up just below.
users_agg = suppressWarnings(
ddply(cycles,
.(user_id),
.fun = summarize,
n_cycles = max(cycle_nb, na.rm = TRUE),
n_days_obs = sum(n_days_obs, na.rm = TRUE),
n_pos_cycles = sum(n_pos_preg_test > 0, na.rm = TRUE),
first_cycle_preg = min(cycle_nb[n_pos_preg_test > 0], na.rm = TRUE))
)
# users without any positive test: min() returned Inf -> encode as 0
users_agg$first_cycle_preg[is.infinite(users_agg$first_cycle_preg)] = 0
# n_obs_after_first_preg: observed days in cycles after the first pregnancy
cycles_tmp = cycles
cycles_tmp$first_cycle_preg = users_agg$first_cycle_preg[match(cycles_tmp$user_id, users_agg$user_id)]
users_agg2 = aggregate(n_days_obs ~ user_id, cycles_tmp[cycles_tmp$cycle_nb > cycles_tmp$first_cycle_preg, ], sum, na.rm = TRUE)
users_agg$n_days_obs_after_first_preg = users_agg2$n_days_obs[match(users_agg$user_id, users_agg2$user_id)]
users_agg$n_days_obs_after_first_preg [is.na(users_agg$n_days_obs_after_first_preg )] = 0
users_agg$n_cycles_after_first_preg = users_agg$n_cycles - users_agg$first_cycle_preg
# minimal cycle length before the first positive preg test
users_agg2 = aggregate(cycle_length ~ user_id,
cycles_tmp[cycles_tmp$cycle_nb < cycles_tmp$first_cycle_preg, ],
min, na.rm = TRUE)
users_agg$shortest_cycle_before_first_pos_preg = users_agg2$cycle_length[match(users_agg$user_id, users_agg2$user_id)]
# adding new columns to the users table (matched on user_id)
# IMPROVED: setdiff() instead of x[-which(x %in% ...)] (empty-match trap)
# and [[ ]] assignment instead of eval(parse(text = ...)).
column_names = setdiff(colnames(users_agg), colnames(users))
m = match(users$user_id, users_agg$user_id)
for(column in column_names){
  users[[column]] = users_agg[[column]][m]
}
save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_with_agg.Rdata"), overwrite = TRUE)
## [1] TRUE
# Keep users whose first pregnancy occurred at cycle 5 or later, whose
# shortest pre-pregnancy cycle was longer than 15 days, and who kept
# tracking after the pregnancy (>= 2 cycles or >= 20 observed days).
keep = with(users, which(
  (first_cycle_preg >= 5) &
    (shortest_cycle_before_first_pos_preg > 15) &
    ((n_cycles_after_first_preg >= 2) | (n_days_obs_after_first_preg >= 20))))
# users
users = users[keep, ]
save(users, file = paste0(IO$output_data, "users.Rdata"))
file.copy(from = paste0(IO$output_data, "users.Rdata"),
          to = paste0(IO$tmp_data, "users_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE
# cycles
# Keep only the cycles belonging to the retained users.
is_kept = cycles$user_id %in% users$user_id
cycles = cycles[is_kept, ]
save(cycles, file = paste0(IO$output_data, "cycles.Rdata"))
file.copy(from = paste0(IO$output_data, "cycles.Rdata"),
          to = paste0(IO$tmp_data, "cycles_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE
# days: gather all per-batch day files into a single table, keeping only
# the days belonging to the retained users.
days_folder = paste0(IO$output_data,"Days/")
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
days_files = list.files(days_folder)
days = foreach(file = days_files, .combine = rbind) %dopar%
{
  load(paste0(days_folder,file), verbose = TRUE)
  # keep only days from users that passed the filtering step
  j = which(days$user_id %in% users$user_id)
  days = days[j,]
  return(days)
}
# BUG FIX: stopCluster(cl) instead of stopImplicitCluster(): the latter
# does not stop clusters created with makeCluster(), leaking the workers.
stopCluster(cl)
save(days, file = paste0(IO$output_data,"days.Rdata"))
file.copy(from = paste0(IO$output_data,"days.Rdata"), to = paste0(IO$tmp_data,"days_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE
Aggregate per-user cycle-length statistics
#load(paste0(IO$tmp_data,"users_with_original_file_id.Rdata"),verbose = TRUE)
# Per-user cycle-length statistics, excluding pregnancy cycles
# (n_pos_preg_test == 0) and, separately, restricted to the cycles before
# the first pregnancy (cycle_nb < first_cycle_preg).
# suppressWarnings: statistics over empty selections can warn and return NA/NaN.
cycles_tmp = cycles_tmp[cycles_tmp$user_id %in% users$user_id,]
users_agg = suppressWarnings(
ddply(cycles_tmp,
.(user_id),
.fun = summarize,
cycle_length_no_preg_avg = mean(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
cycle_length_no_preg_median = median(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
cycle_length_no_preg_sd = sd(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
cycle_length_before_preg_avg = mean(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE),
cycle_length_before_preg_median = median(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE),
cycle_length_before_preg_sd = sd(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE))
)
# Add the per-user cycle-length statistics to the users table.
# IMPROVED: setdiff() instead of x[-which(x %in% ...)] (empty-match trap)
# and [[ ]] assignment instead of eval(parse(text = ...)).
column_names = setdiff(colnames(users_agg), colnames(users))
m = match(users$user_id, users_agg$user_id)
for(column in column_names){
  users[[column]] = users_agg[[column]][m]
}
save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_with_cycle_length_stats.Rdata"), overwrite = TRUE)
## [1] TRUE
# knitr chunk options for this section of the report
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
Distribution of cycle lengths for cycles with positive pregnancy tests, with only negative pregnancy tests, and with no pregnancy tests.
Thresholds for EPL (early pregnancy loss), LPL_P (late pregnancy loss or premature birth), LB (live birth) and BF (breastfeeding).
# Load the users, cycles and days tables produced by the previous section.
load(paste0(IO$output_data, "users.Rdata"), verbose = TRUE)
## Loading objects:
## users
load(paste0(IO$output_data, "cycles.Rdata"), verbose = TRUE)
## Loading objects:
## cycles
load(paste0(IO$output_data, "days.Rdata"), verbose = TRUE)
## Loading objects:
## days
# Histogram of cycle length by pregnancy-test class (one panel per class).
g = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) +
geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity")+
xlim(c(0,750))+
facet_grid(preg_test_class ~ .)
g
## Warning: Removed 1502 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Same histogram restricted to non-pregnant cycles, zoomed on 0-150 days.
g = ggplot(cycles[cycles$preg_test_class != "pregnant",], aes(x = cycle_length, fill = preg_test_class)) +
geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity", alpha = 0.5)+
xlim(c(0,150))
#facet_grid(preg_test_class ~ .)
g
## Warning: Removed 1735 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
# Long-term histogram (weekly bins) with the pregnancy-timeline landmarks
# (dict$pregnancy_timeline$duration_in_days) drawn as dashed vertical lines.
g_hist_lt = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) +
geom_vline(xintercept = dict$pregnancy_timeline$duration_in_days, col = "gray", linetype = 2)+
geom_histogram(aes(y = ..density..), binwidth = 7, position = "identity", alpha = 0.5)+
scale_x_continuous(breaks = viz$xaxis_m*28, minor_breaks = viz$xaxis_s*7, labels = viz$xaxis_m*4, limits = c(0,20*28))+
ylab("% of cycles")+ xlab("cycle or pregnancy duration (in weeks)")+
scale_fill_discrete(name = "Cycle label")+ theme(legend.position = "bottom")
g_hist_lt
## Warning: Removed 1672 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Short-term version (daily bins, 0-150 days), reused as an inset below.
g_hist_st = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) +
geom_vline(xintercept = dict$pregnancy_timeline$duration_in_days, col = "gray", linetype = 2)+
geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity", alpha = 0.5)+
scale_x_continuous(breaks = viz$xaxis_m*28, minor_breaks = viz$xaxis_s*7, labels = viz$xaxis_m*4, limits = c(0,150))+
ylab("% of cycles")+ xlab("cycle or pregnancy duration (in weeks)")+
guides(fill = FALSE)
#facet_grid(preg_test_class ~ .)
g_hist_st
## Warning: Removed 2572 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Render the short-term plot as a grob so it can be inset in the long-term plot.
g_inset = ggplotGrob(g_hist_st +
theme(plot.background = element_rect(colour = "gray40")))
## Warning: Removed 2572 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Long-term histogram with the short-term histogram inset top-right.
g_hist_lt + annotation_custom(
grob = g_inset,
xmin = 24*7,
xmax = Inf,
ymin = 0.03,
ymax = Inf
)
## Warning: Removed 1672 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Classify pregnancy cycles into outcomes based on pregnancy duration.
# The factor levels combine the pregnancy-timeline abbreviations
# (dict$pregnancy_timeline$abbreviation) with the original preg_test_class
# values, so both kinds of labels can be stored in the same column.
cycles$preg_outcome = factor(cycles$preg_test_class,
levels = c(dict$pregnancy_timeline$abbreviation,
unique(cycles$preg_test_class)))
j = which(cycles$preg_test_class == "pregnant")
# cut() bins each pregnancy's duration into the timeline intervals; the
# resulting labels are matched by name against the factor levels above.
cycles$preg_outcome[j] = cut(cycles$cycle_length[j],
breaks = c(0,dict$pregnancy_timeline$duration_in_days),
labels = as.character(dict$pregnancy_timeline$abbreviation))
counts = table(cycles$preg_outcome[j])
# fractions relative to the cycles classified as a loss or a birth
table(cycles$preg_outcome[j])/sum(cycles$preg_outcome %in% c("EPL","LPL","ExPTB","PTB","TB","BF"))
##
## FP|VEPL EPL LPL ExPTB PTB
## 0.37703583 0.34690554 0.12540717 0.01547231 0.01710098
## TB BF unclear not tested pregnant
## 0.11156352 0.38355049 0.15228013 0.00000000 0.00000000
## not pregnant
## 0.00000000
j = which(cycles$preg_test_class == "pregnant")
# Bar plot of the distribution of pregnancy outcomes among pregnancy cycles.
ggplot(cycles[j,], aes(x = preg_outcome, fill = preg_outcome)) +
geom_bar(aes(y = (..count..)/sum(..count..)))+
xlab("Pregnancy outcome")+ ylab("% cycles")+
scale_fill_manual(values = dict$pregnancy_timeline$colors)+
scale_y_continuous(labels=percent)+
guides(fill = FALSE)
save(cycles, file = paste0(IO$output_data, "cycles.Rdata"))
file.copy(from = paste0(IO$output_data, "cycles.Rdata"), to = paste0(IO$tmp_data, "cycles_with_preg_outcome.Rdata"), overwrite = TRUE)
## [1] TRUE
# Per-user counts of pregnancies, pregnancy losses (PL: EPL or LPL) and
# live births (LB: extreme/preterm/term birth or breastfeeding).
users_preg_outcome = ddply(cycles,
.(user_id),
.fun = summarize,
n_preg = sum(preg_test_class == "pregnant", na.rm = TRUE),
n_PL = sum(preg_outcome %in% c("EPL","LPL"), na.rm = TRUE),
n_LB = sum(preg_outcome %in% c("ExPTB","PTB","TB","BF"), na.rm = TRUE)
)
# Cross-tabulation: live births (rows) vs pregnancy losses (columns) per user.
table(users_preg_outcome$n_LB, users_preg_outcome$n_PL)
##
## 0 1 2 3
## 0 412 395 33 4
## 1 496 86 9 0
## 2 24 3 0 0
## 3 1 0 0 0
table(users_preg_outcome$n_LB)
##
## 0 1 2 3
## 844 591 27 1
table(users_preg_outcome$n_PL)
##
## 0 1 2 3
## 933 484 42 4
# Restrict to users with at least one classified outcome (loss or birth).
j = which((users_preg_outcome$n_LB + users_preg_outcome$n_PL)>0)
table(users_preg_outcome$n_LB[j], users_preg_outcome$n_PL[j])
##
## 0 1 2 3
## 0 0 395 33 4
## 1 496 86 9 0
## 2 24 3 0 0
## 3 1 0 0 0
table(users_preg_outcome$n_LB[j])
##
## 0 1 2 3
## 432 591 27 1
round(table(users_preg_outcome$n_LB[j])/sum(table(users_preg_outcome$n_LB[j])) * 100, 2)
##
## 0 1 2 3
## 41.10 56.23 2.57 0.10
table(users_preg_outcome$n_PL[j])
##
## 0 1 2 3
## 521 484 42 4
round(table(users_preg_outcome$n_PL[j])/sum(table(users_preg_outcome$n_PL[j])) * 100, 2)
##
## 0 1 2 3
## 49.57 46.05 4.00 0.38
# Sanity check: users and users_preg_outcome have the same number of rows.
dim(users)
## [1] 1463 17
dim(users_preg_outcome)
## [1] 1463 4
# Add the per-user outcome counts to the users table (matched on user_id).
# IMPROVED: the original assigned a variable literally named `colnames`,
# shadowing base::colnames(), and used eval(parse(text = ...)) for the
# assignment; both replaced with plain, safer equivalents.
column_names = colnames(users_preg_outcome)[2:ncol(users_preg_outcome)]
m = match(users$user_id, users_preg_outcome$user_id)
for(column in column_names){
  users[[column]] = users_preg_outcome[[column]][m]
}
save(users, file = paste0(IO$output_data, "users.Rdata"))
file.copy(from = paste0(IO$output_data, "users.Rdata"), to = paste0(IO$tmp_data, "users_with_preg_outcome.Rdata"), overwrite = TRUE)
## [1] TRUE
# Does cycle-length variability before pregnancy predict pregnancy loss?
load(file = paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
## Loading objects:
## users
# any_PL: TRUE if the user had at least one pregnancy loss, FALSE if they
# only had live births, NA if neither outcome is classified.
users$any_PL = ifelse(users$n_PL>0, TRUE, ifelse(users$n_LB>0,FALSE,NA))
u = users[which(!is.na(users$any_PL)),]
# Logistic regression of any_PL on all three pre-pregnancy cycle-length stats.
glm_cl = glm(
any_PL ~ cycle_length_before_preg_avg +
cycle_length_before_preg_median +
cycle_length_before_preg_sd,
data = u,
family = "binomial")
summary(glm_cl)
##
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_avg + cycle_length_before_preg_median +
## cycle_length_before_preg_sd, family = "binomial", data = u)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8277 -1.1627 0.8129 1.1861 1.3780
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.722218 0.361196 -2.000 0.0456 *
## cycle_length_before_preg_avg 0.015717 0.016355 0.961 0.3365
## cycle_length_before_preg_median 0.008166 0.018043 0.453 0.6509
## cycle_length_before_preg_sd -0.004157 0.007254 -0.573 0.5666
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1456.9 on 1050 degrees of freedom
## Residual deviance: 1449.3 on 1047 degrees of freedom
## AIC: 1457.3
##
## Number of Fisher Scoring iterations: 4
# Median cycle length alone.
glm_cl = glm(
any_PL ~ cycle_length_before_preg_median,
data = u,
family = "binomial")
summary(glm_cl)
##
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_median, family = "binomial",
## data = u)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9070 -1.1705 0.9551 1.1843 1.2463
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.71296 0.35639 -2.001 0.0454 *
## cycle_length_before_preg_median 0.02402 0.01157 2.076 0.0379 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1456.9 on 1050 degrees of freedom
## Residual deviance: 1452.3 on 1049 degrees of freedom
## AIC: 1456.3
##
## Number of Fisher Scoring iterations: 4
# Cycle-length standard deviation alone.
glm_cl = glm(
any_PL ~ cycle_length_before_preg_sd,
data = u,
family = "binomial")
summary(glm_cl)
##
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_sd, family = "binomial",
## data = u)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5157 -1.1709 0.8439 1.1832 1.1874
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.024834 0.066305 -0.375 0.7080
## cycle_length_before_preg_sd 0.003201 0.001897 1.688 0.0914 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1456.9 on 1050 degrees of freedom
## Residual deviance: 1453.8 on 1049 degrees of freedom
## AIC: 1457.8
##
## Number of Fisher Scoring iterations: 4
# Visual checks: distributions of the cycle-length statistics by outcome.
ggplot(u, aes(x = cycle_length_before_preg_avg, fill = any_PL))+
geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)
ggplot(u, aes(x = cycle_length_before_preg_median, fill = any_PL))+
geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)
ggplot(u, aes(x = cycle_length_before_preg_median, fill = any_PL))+
geom_density(col = NA, alpha = 0.5, bw = 2)
ggplot(u, aes(x = log(cycle_length_before_preg_sd), fill = any_PL))+
geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Use HMM model (adapt it to include prior knowledge and LH tests)
Label days table
Aggregate cycles table with
ovulation day + uncertainty
temperature shift
Aggregate users table with
Loading users and days tables.
# Visual inspection of the tracking histories of a few example users.
load(file = paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
## Loading objects:
## users
load(file = paste0(IO$output_data,"days.Rdata"), verbose = TRUE)
## Loading objects:
## days
# Users with exactly one pregnancy, ending in a loss, and a medium-sized
# tracking history (15-20 cycles, 300-400 observed days).
j = which((users$n_PL == 1) & (users$n_preg == 1) & (users$n_cycles %in% 15:20) & (users$n_days_obs %in% 300:400))
for(user in users$user_id[j]){
cat("\n",user, "\n")
# skip the first cycle (cycle_nb >= 2); plot.tracking.history() is a
# project helper defined elsewhere
d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
##
## 05def79e901d45508c67bcfd44a33ddd
##
## 507abdf43cdd491abdb3a32e65923928
##
## 6e55b1daeae34508b9f231f2b873f537
##
## 70d07733a9884416b056b5f819be9094
##
## 9904052a9836407abf2f569e157c1a42
##
## aa1caffaa60c4a6086cd3db9bfecd5bd
##
## aa6193a3ce84413196177052fe87f493
##
## e66dd8871c6b442b927a07d73959bf4e
# Hand-picked examples from the plots above.
user_ids = c("70d07733a9884416b056b5f819be9094",
"aa1caffaa60c4a6086cd3db9bfecd5bd",
"aa6193a3ce84413196177052fe87f493")
# Users with exactly one pregnancy, ending in a live birth, and a
# medium-sized tracking history.
j = which((users$n_LB == 1) & (users$n_preg == 1) & (users$n_cycles %in% 15:20) & (users$n_days_obs %in% 300:400))
for(user in users$user_id[j]){
cat("\n",user, "\n")
d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
##
## 04251d0121874e29bf52d1428b67efa7
##
## 1552e3fc56dc48ba977d69ed2a987841
##
## 215b7ff9f2ed494cb7a17d4d4476270d
##
## 3b4d245e51b94be4ac1fdbe0251ba1b1
##
## 3b8f6fb7c38545fda33c5461bba340e5
##
## 3ee8da51d47e4c178f739450b57d69f2
##
## 49e5591bfe344e68aefda28a19bf792c
##
## 5524de1daa6e4dc598e2c9fb32ff0f48
##
## 6fa2b5df2c074a7da88a01207bed4352
##
## 78b74e2cdfc2417096ad5f9d56132305
##
## 7fcc75a22714449286218b08e69baeeb
##
## a43a65171a604f9fab859ce5c9ce2168
##
## c170dd91b6004d8283bd715941655019
##
## ce4c161eba2449d8a64aa3cba78e0bbb
##
## d9979a69c1bd4cd2956d649cac6cdb61
# Add more hand-picked examples and re-plot the whole selection.
user_ids = c(user_ids,
"1552e3fc56dc48ba977d69ed2a987841",
"3b8f6fb7c38545fda33c5461bba340e5",
"3ee8da51d47e4c178f739450b57d69f2",
"6fa2b5df2c074a7da88a01207bed4352",
"c170dd91b6004d8283bd715941655019")
for(user in user_ids){
cat("\n",user, "\n")
d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
##
## 70d07733a9884416b056b5f819be9094
##
## aa1caffaa60c4a6086cd3db9bfecd5bd
##
## aa6193a3ce84413196177052fe87f493
##
## 1552e3fc56dc48ba977d69ed2a987841
##
## 3b8f6fb7c38545fda33c5461bba340e5
##
## 3ee8da51d47e4c178f739450b57d69f2
##
## 6fa2b5df2c074a7da88a01207bed4352
##
## c170dd91b6004d8283bd715941655019
# Final selection of three example users, saved for the later chapters.
# CLEANUP: the assignment below appeared twice, byte-identical; the
# duplicated (dead) copy was removed.
user_ids = c("70d07733a9884416b056b5f819be9094",
             "c170dd91b6004d8283bd715941655019",
             "3b8f6fb7c38545fda33c5461bba340e5")
# keep only the days of the selected users
days = days[which(days$user_id %in% user_ids),]
save(days, file = paste0(IO$tmp_data,"days_selected_users.Rdata"))